# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import missingno as msno
# Load the dataset into a pandas DataFrame using read_csv's default delimiter (first look at the raw file)
air_quality_data = pd.read_csv('AirQuality.csv')
-------------------------------First observations------------------------------------
# Number of rows and columns; a single column here means the fields were not split on load
air_quality_data.shape
(9471, 1)
# Preview the first 5 rows of the dataset
air_quality_data.head()
| Date;Time;CO(GT);PT08.S1(CO);NMHC(GT);C6H6(GT);PT08.S2(NMHC);NOx(GT);PT08.S3(NOx);NO2(GT);PT08.S4(NO2);PT08.S5(O3);T;RH;AH;; | |||||
|---|---|---|---|---|---|
| 10/03/2004;18.00.00;2 | 6;1360;150;11 | 9;1046;166;1056;113;1692;1268;13 | 6;48 | 9;0 | 7578;; |
| 10/03/2004;19.00.00;2;1292;112;9 | 4;955;103;1174;92;1559;972;13 | 3;47 | 7;0 | 7255;; | NaN |
| 10/03/2004;20.00.00;2 | 2;1402;88;9 | 0;939;131;1140;114;1555;1074;11 | 9;54 | 0;0 | 7502;; |
| 10/03/2004;21.00.00;2 | 2;1376;80;9 | 2;948;172;1092;122;1584;1203;11 | 0;60 | 0;0 | 7867;; |
| 10/03/2004;22.00.00;1 | 6;1272;51;6 | 5;836;131;1205;116;1490;1110;11 | 2;59 | 6;0 | 7888;; |
-------------------------------Data Cleaning------------------------------------
Step 1 - All the values in the CSV file are separated by ‘;’, and a few columns use commas in place of decimal points
# Re-read the file with ';' as the field delimiter and ',' as the decimal mark
air_quality_data = pd.read_csv('AirQuality.csv', sep=';', decimal=',')
# Preview the first 5 rows to confirm the columns are now split and numeric
air_quality_data.head()
| Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | Unnamed: 15 | Unnamed: 16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10/03/2004 | 18.00.00 | 2.6 | 1360.0 | 150.0 | 11.9 | 1046.0 | 166.0 | 1056.0 | 113.0 | 1692.0 | 1268.0 | 13.6 | 48.9 | 0.7578 | NaN | NaN |
| 1 | 10/03/2004 | 19.00.00 | 2.0 | 1292.0 | 112.0 | 9.4 | 955.0 | 103.0 | 1174.0 | 92.0 | 1559.0 | 972.0 | 13.3 | 47.7 | 0.7255 | NaN | NaN |
| 2 | 10/03/2004 | 20.00.00 | 2.2 | 1402.0 | 88.0 | 9.0 | 939.0 | 131.0 | 1140.0 | 114.0 | 1555.0 | 1074.0 | 11.9 | 54.0 | 0.7502 | NaN | NaN |
| 3 | 10/03/2004 | 21.00.00 | 2.2 | 1376.0 | 80.0 | 9.2 | 948.0 | 172.0 | 1092.0 | 122.0 | 1584.0 | 1203.0 | 11.0 | 60.0 | 0.7867 | NaN | NaN |
| 4 | 10/03/2004 | 22.00.00 | 1.6 | 1272.0 | 51.0 | 6.5 | 836.0 | 131.0 | 1205.0 | 116.0 | 1490.0 | 1110.0 | 11.2 | 59.6 | 0.7888 | NaN | NaN |
Step 2 - Removing the last 2 columns from the dataframe
# Remove the two empty trailing columns produced by the ';;' at the end of each line.
# Dropping by name (rather than by position) keeps the cell safe to re-run: a second
# execution is a no-op instead of silently removing two real measurement columns.
air_quality_data = air_quality_data.drop(columns=['Unnamed: 15', 'Unnamed: 16'], errors='ignore')
# printing the first 5 rows of the dataset
air_quality_data.head()
| Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10/03/2004 | 18.00.00 | 2.6 | 1360.0 | 150.0 | 11.9 | 1046.0 | 166.0 | 1056.0 | 113.0 | 1692.0 | 1268.0 | 13.6 | 48.9 | 0.7578 |
| 1 | 10/03/2004 | 19.00.00 | 2.0 | 1292.0 | 112.0 | 9.4 | 955.0 | 103.0 | 1174.0 | 92.0 | 1559.0 | 972.0 | 13.3 | 47.7 | 0.7255 |
| 2 | 10/03/2004 | 20.00.00 | 2.2 | 1402.0 | 88.0 | 9.0 | 939.0 | 131.0 | 1140.0 | 114.0 | 1555.0 | 1074.0 | 11.9 | 54.0 | 0.7502 |
| 3 | 10/03/2004 | 21.00.00 | 2.2 | 1376.0 | 80.0 | 9.2 | 948.0 | 172.0 | 1092.0 | 122.0 | 1584.0 | 1203.0 | 11.0 | 60.0 | 0.7867 |
| 4 | 10/03/2004 | 22.00.00 | 1.6 | 1272.0 | 51.0 | 6.5 | 836.0 | 131.0 | 1205.0 | 116.0 | 1490.0 | 1110.0 | 11.2 | 59.6 | 0.7888 |
Step 3 - Removing the trailing rows of the dataframe that contain only null values
# Show the last 5 rows to inspect the trailing records
air_quality_data.tail()
| Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9466 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9467 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9468 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9469 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9470 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Number of rows and columns before the trailing null rows are removed
air_quality_data.shape
(9471, 15)
Index 9356 holds the last valid data point in the data frame; the remaining rows contain only null values. Because Python indexing starts at 0, index 9356 is the 9357th row, so we keep the first 9357 rows of the data frame.
# Select the row at index 9356; the list-style indexer returns it as a one-row DataFrame
air_quality_data.loc[[9356]]
| Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9356 | 04/04/2005 | 14.00.00 | 2.2 | 1071.0 | -200.0 | 11.9 | 1047.0 | 265.0 | 654.0 | 168.0 | 1129.0 | 816.0 | 28.5 | 13.1 | 0.5028 |
# Remove the rows in which every column is null (the padding after the last valid
# data point) instead of relying on a hard-coded row count, then renumber the index
# so it remains a contiguous 0..n-1 range.
air_quality_data = air_quality_data.dropna(how='all').reset_index(drop=True)
# printing the first 5 rows of the dataset
air_quality_data.head()
| Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10/03/2004 | 18.00.00 | 2.6 | 1360.0 | 150.0 | 11.9 | 1046.0 | 166.0 | 1056.0 | 113.0 | 1692.0 | 1268.0 | 13.6 | 48.9 | 0.7578 |
| 1 | 10/03/2004 | 19.00.00 | 2.0 | 1292.0 | 112.0 | 9.4 | 955.0 | 103.0 | 1174.0 | 92.0 | 1559.0 | 972.0 | 13.3 | 47.7 | 0.7255 |
| 2 | 10/03/2004 | 20.00.00 | 2.2 | 1402.0 | 88.0 | 9.0 | 939.0 | 131.0 | 1140.0 | 114.0 | 1555.0 | 1074.0 | 11.9 | 54.0 | 0.7502 |
| 3 | 10/03/2004 | 21.00.00 | 2.2 | 1376.0 | 80.0 | 9.2 | 948.0 | 172.0 | 1092.0 | 122.0 | 1584.0 | 1203.0 | 11.0 | 60.0 | 0.7867 |
| 4 | 10/03/2004 | 22.00.00 | 1.6 | 1272.0 | 51.0 | 6.5 | 836.0 | 131.0 | 1205.0 | 116.0 | 1490.0 | 1110.0 | 11.2 | 59.6 | 0.7888 |
# printing the last 5 rows of the dataset
air_quality_data.tail()
| Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9352 | 04/04/2005 | 10.00.00 | 3.1 | 1314.0 | -200.0 | 13.5 | 1101.0 | 472.0 | 539.0 | 190.0 | 1374.0 | 1729.0 | 21.9 | 29.3 | 0.7568 |
| 9353 | 04/04/2005 | 11.00.00 | 2.4 | 1163.0 | -200.0 | 11.4 | 1027.0 | 353.0 | 604.0 | 179.0 | 1264.0 | 1269.0 | 24.3 | 23.7 | 0.7119 |
| 9354 | 04/04/2005 | 12.00.00 | 2.4 | 1142.0 | -200.0 | 12.4 | 1063.0 | 293.0 | 603.0 | 175.0 | 1241.0 | 1092.0 | 26.9 | 18.3 | 0.6406 |
| 9355 | 04/04/2005 | 13.00.00 | 2.1 | 1003.0 | -200.0 | 9.5 | 961.0 | 235.0 | 702.0 | 156.0 | 1041.0 | 770.0 | 28.3 | 13.5 | 0.5139 |
| 9356 | 04/04/2005 | 14.00.00 | 2.2 | 1071.0 | -200.0 | 11.9 | 1047.0 | 265.0 | 654.0 | 168.0 | 1129.0 | 816.0 | 28.5 | 13.1 | 0.5028 |
# Number of rows and columns after the trailing null rows were removed
air_quality_data.shape
(9357, 15)
Step 4 - Gathering information about the data
# Column names, non-null counts and dtypes of the cleaned frame
air_quality_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9357 entries, 0 to 9356 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 9357 non-null object 1 Time 9357 non-null object 2 CO(GT) 9357 non-null float64 3 PT08.S1(CO) 9357 non-null float64 4 NMHC(GT) 9357 non-null float64 5 C6H6(GT) 9357 non-null float64 6 PT08.S2(NMHC) 9357 non-null float64 7 NOx(GT) 9357 non-null float64 8 PT08.S3(NOx) 9357 non-null float64 9 NO2(GT) 9357 non-null float64 10 PT08.S4(NO2) 9357 non-null float64 11 PT08.S5(O3) 9357 non-null float64 12 T 9357 non-null float64 13 RH 9357 non-null float64 14 AH 9357 non-null float64 dtypes: float64(13), object(2) memory usage: 1.1+ MB
Step 5 - Checking the number of missing values and replacing with mean
# Count the null (NaN) entries in each column
air_quality_data.isnull().sum()
Date 0 Time 0 CO(GT) 0 PT08.S1(CO) 0 NMHC(GT) 0 C6H6(GT) 0 PT08.S2(NMHC) 0 NOx(GT) 0 PT08.S3(NOx) 0 NO2(GT) 0 PT08.S4(NO2) 0 PT08.S5(O3) 0 T 0 RH 0 AH 0 dtype: int64
This shows that there are no missing values in the dataset. But the actual missing values are tagged with the value “-200”. This is described in dataset documentation.
# Count, per column, how many cells hold the value -200
# (axis=0 sums down the rows, giving one total per column).
air_quality_data.isin([-200]).sum(axis=0)
Date 0 Time 0 CO(GT) 1683 PT08.S1(CO) 366 NMHC(GT) 8443 C6H6(GT) 366 PT08.S2(NMHC) 366 NOx(GT) 1639 PT08.S3(NOx) 366 NO2(GT) 1642 PT08.S4(NO2) 366 PT08.S5(O3) 366 T 366 RH 366 AH 366 dtype: int64
Create a bar plot to visually represent the percentage of missing values for each column.
# Per-column count of cells equal to the -200 marker
missing_values_count = air_quality_data.isin([-200]).sum(axis=0)
# Express each count as a percentage of the total number of rows
missing_values_percentage = missing_values_count.div(len(air_quality_data)).mul(100)
# Long-format table (one row per column name) for seaborn
missing_data = missing_values_percentage.rename_axis('Column').reset_index(name='Percentage')
# Draw the percentages as a bar chart on a fresh 10x6 figure
plt.figure(figsize=(10, 6))
bar_axes = sns.barplot(data=missing_data, x='Column', y='Percentage', color='lightcoral')
bar_axes.set_title('Missing Values Percentage by Column')
bar_axes.set_xlabel('Columns')
bar_axes.set_ylabel('Percentage (%)')
bar_axes.set_ylim(0, 100)  # fixed 0-100 scale
plt.xticks(rotation=45)  # slant the column names so they do not overlap
plt.tight_layout()
plt.show()
Handling missing values
Convert every -200 to NaN, then replace each NaN with the mean of its column.
# Turn the -200 marker into NaN so pandas treats those cells as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
air_quality_data = air_quality_data.replace(to_replace=-200, value=np.nan)
#checking the number of missing values in the data frame
air_quality_data.isnull().sum()
Date 0 Time 0 CO(GT) 1683 PT08.S1(CO) 366 NMHC(GT) 8443 C6H6(GT) 366 PT08.S2(NMHC) 366 NOx(GT) 1639 PT08.S3(NOx) 366 NO2(GT) 1642 PT08.S4(NO2) 366 PT08.S5(O3) 366 T 366 RH 366 AH 366 dtype: int64
This shows the actual number of missing values
# Show the last 5 rows to confirm the -200 entries now appear as NaN
air_quality_data.tail()
| Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9352 | 04/04/2005 | 10.00.00 | 3.1 | 1314.0 | NaN | 13.5 | 1101.0 | 472.0 | 539.0 | 190.0 | 1374.0 | 1729.0 | 21.9 | 29.3 | 0.7568 |
| 9353 | 04/04/2005 | 11.00.00 | 2.4 | 1163.0 | NaN | 11.4 | 1027.0 | 353.0 | 604.0 | 179.0 | 1264.0 | 1269.0 | 24.3 | 23.7 | 0.7119 |
| 9354 | 04/04/2005 | 12.00.00 | 2.4 | 1142.0 | NaN | 12.4 | 1063.0 | 293.0 | 603.0 | 175.0 | 1241.0 | 1092.0 | 26.9 | 18.3 | 0.6406 |
| 9355 | 04/04/2005 | 13.00.00 | 2.1 | 1003.0 | NaN | 9.5 | 961.0 | 235.0 | 702.0 | 156.0 | 1041.0 | 770.0 | 28.3 | 13.5 | 0.5139 |
| 9356 | 04/04/2005 | 14.00.00 | 2.2 | 1071.0 | NaN | 11.9 | 1047.0 | 265.0 | 654.0 | 168.0 | 1129.0 | 816.0 | 28.5 | 13.1 | 0.5028 |
Find the Mean of each column
# Mean of every numeric column (NaN cells are skipped by pandas' default)
numeric_part = air_quality_data.select_dtypes(include='number')
mean_values = numeric_part.mean()
print(mean_values)
CO(GT) 2.152750 PT08.S1(CO) 1099.833166 NMHC(GT) 218.811816 C6H6(GT) 10.083105 PT08.S2(NMHC) 939.153376 NOx(GT) 246.896735 PT08.S3(NOx) 835.493605 NO2(GT) 113.091251 PT08.S4(NO2) 1456.264598 PT08.S5(O3) 1022.906128 T 18.317829 RH 49.234201 AH 1.025530 dtype: float64
# Fill the NaN cells of each numeric column with that column's mean;
# the non-numeric Date and Time columns are left untouched.
# NOTE(review): NMHC(GT) has 8443 missing values out of 9357 rows, so after this
# step most of that column holds the single imputed mean value.
numeric_columns = air_quality_data.select_dtypes(include='number')
column_means = numeric_columns.mean()
air_quality_data[numeric_columns.columns] = numeric_columns.fillna(column_means)
# Show the last 5 rows to check the imputed values
air_quality_data.tail()
| Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9352 | 04/04/2005 | 10.00.00 | 3.1 | 1314.0 | 218.811816 | 13.5 | 1101.0 | 472.0 | 539.0 | 190.0 | 1374.0 | 1729.0 | 21.9 | 29.3 | 0.7568 |
| 9353 | 04/04/2005 | 11.00.00 | 2.4 | 1163.0 | 218.811816 | 11.4 | 1027.0 | 353.0 | 604.0 | 179.0 | 1264.0 | 1269.0 | 24.3 | 23.7 | 0.7119 |
| 9354 | 04/04/2005 | 12.00.00 | 2.4 | 1142.0 | 218.811816 | 12.4 | 1063.0 | 293.0 | 603.0 | 175.0 | 1241.0 | 1092.0 | 26.9 | 18.3 | 0.6406 |
| 9355 | 04/04/2005 | 13.00.00 | 2.1 | 1003.0 | 218.811816 | 9.5 | 961.0 | 235.0 | 702.0 | 156.0 | 1041.0 | 770.0 | 28.3 | 13.5 | 0.5139 |
| 9356 | 04/04/2005 | 14.00.00 | 2.2 | 1071.0 | 218.811816 | 11.9 | 1047.0 | 265.0 | 654.0 | 168.0 | 1129.0 | 816.0 | 28.5 | 13.1 | 0.5028 |
# Re-count the null entries per column after the mean imputation
air_quality_data.isnull().sum()
Date 0 Time 0 CO(GT) 0 PT08.S1(CO) 0 NMHC(GT) 0 C6H6(GT) 0 PT08.S2(NMHC) 0 NOx(GT) 0 PT08.S3(NOx) 0 NO2(GT) 0 PT08.S4(NO2) 0 PT08.S5(O3) 0 T 0 RH 0 AH 0 dtype: int64
Removing Outliers
# Use bar plots for non-numeric columns like 'Date' and 'Time',
# and apply box plots and histograms for numeric data.
def generate_boxplot(data, column_name):
    """Show a vertical box plot of ``data[column_name]`` on a new 8x6 figure.

    Args:
        data: Table that is indexed with ``column_name`` to obtain the values.
        column_name: Column to plot; also used as the y-axis label and in the title.
    """
    plt.figure(figsize=(8, 6))
    axes = sns.boxplot(y=data[column_name])
    # Print the y tick labels as plain numbers rather than in scientific notation
    axes.yaxis.get_major_formatter().set_scientific(False)
    axes.set_ylabel(column_name)
    axes.set_title(f'Box Plot of {column_name}')
    plt.show()
def generate_histogram(data, column_name):
    """Show a 20-bin histogram with a KDE curve for ``data[column_name]``.

    Args:
        data: Table that is indexed with ``column_name`` to obtain the values.
        column_name: Column to plot; also used as the x-axis label and in the title.
    """
    plt.figure(figsize=(8, 6))
    axes = sns.histplot(data[column_name], bins=20, kde=True)
    # Print the y tick labels as plain numbers rather than in scientific notation
    axes.yaxis.get_major_formatter().set_scientific(False)
    axes.set_xlabel(column_name)
    axes.set_ylabel('Frequency')
    axes.set_title(f'Histogram of {column_name}')
    plt.show()
# Bar chart of value counts for one column
def generate_bar_chart(data, column_name):
    """Plot how often each value of a column occurs and return the counts.

    For the 'Date' column the values are first parsed as day/month/year dates
    and written back into ``data`` (the caller's frame is modified in place),
    then counted per calendar month. Any other column is counted per distinct
    value.

    Args:
        data: DataFrame holding ``column_name``.
        column_name: Column whose distribution is plotted.

    Returns:
        The value counts, sorted by index (monthly periods for 'Date').
    """
    if column_name == 'Date':
        # In-place conversion: the parsed dates replace the original column
        data[column_name] = pd.to_datetime(data[column_name], format='%d/%m/%Y')
        grouping_keys = data[column_name].dt.to_period('M')
    else:
        grouping_keys = data[column_name]
    data_counts = grouping_keys.value_counts().sort_index()

    plt.figure(figsize=(8, 4))
    # The index is rendered as strings so every category gets a readable tick label
    axes = sns.barplot(x=data_counts.index.astype(str), y=data_counts.values)
    axes.set_xlabel(column_name)
    axes.set_ylabel('Count')
    axes.set_title(f'Distribution of {column_name}')
    plt.xticks(rotation=90)
    axes.set_ylim(0, data_counts.max() * 1.1)  # 10% headroom above the tallest bar
    axes.yaxis.get_major_formatter().set_scientific(False)
    plt.show()
    return data_counts
- Column Date
# Plot the number of records per month and keep the counts for the outlier check
data_counts = generate_bar_chart(air_quality_data, 'Date')
# Print the heading and the monthly counts on separate lines
print("Data Counts by Month:", data_counts, sep="\n")
Data Counts by Month: Date 2004-03 510 2004-04 720 2004-05 744 2004-06 720 2004-07 744 2004-08 744 2004-09 720 2004-10 744 2004-11 720 2004-12 744 2005-01 744 2005-02 672 2005-03 744 2005-04 87 Freq: M, Name: count, dtype: int64
# First and third quartiles of the monthly record counts, and their spread
Q1, Q3 = np.percentile(data_counts.values, [25, 75])
IQR = Q3 - Q1
# Fences at 1.5 * IQR beyond the quartiles
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# Months below the lower fence are listed first, then months above the upper fence
outliers = [
    *data_counts.index[(data_counts < lower_bound).values],
    *data_counts.index[(data_counts > upper_bound).values],
]
print("Outliers detected:", outliers)
Outliers detected: [Period('2004-03', 'M'), Period('2005-02', 'M'), Period('2005-04', 'M')]
# Remove the rows recorded in April 2005. Only this month is dropped here, although
# the IQR check above also flagged March 2004 and February 2005.
# .copy() makes the result an independent frame, so the column assignments performed
# later (date parsing, Time conversion) do not write into a slice of the old frame
# and trigger SettingWithCopyWarning.
in_april_2005 = air_quality_data['Date'].dt.to_period('M') == '2005-04'
air_quality_data = air_quality_data[~in_april_2005].copy()
# Regenerate the bar chart to see the updated distribution
updated_data_counts = generate_bar_chart(air_quality_data, 'Date')
print("Updated Data Counts by Month:")
print(updated_data_counts)
Updated Data Counts by Month: Date 2004-03 510 2004-04 720 2004-05 744 2004-06 720 2004-07 744 2004-08 744 2004-09 720 2004-10 744 2004-11 720 2004-12 744 2005-01 744 2005-02 672 2005-03 744 Freq: M, Name: count, dtype: int64
- Column Time
# Step 1: Convert 'HH.MM.SS' strings to 'HH:MM:SS' and parse them into datetime.time values.
# regex=False makes '.' a literal dot regardless of the installed pandas version's
# default for the `regex` argument.
air_quality_data['Time'] = air_quality_data['Time'].astype(str).str.replace('.', ':', regex=False)
air_quality_data['Time'] = pd.to_datetime(air_quality_data['Time'], format='%H:%M:%S', errors='coerce').dt.time
# Step 2: Express each time of day as seconds since midnight
air_quality_data['Time_in_seconds'] = air_quality_data['Time'].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)
# Step 3: Inspect unique time values
print("Unique Time Values:")
print(air_quality_data['Time'].unique())
# Step 4: Check the range of the converted values
min_time = air_quality_data['Time_in_seconds'].min()
max_time = air_quality_data['Time_in_seconds'].max()
print("Min Time in seconds:", min_time)
print("Max Time in seconds:", max_time)
# Step 5: Flag values more than 2 standard deviations away from the mean
mean = air_quality_data['Time_in_seconds'].mean()
std_dev = air_quality_data['Time_in_seconds'].std()
lower_bound = mean - 2 * std_dev
upper_bound = mean + 2 * std_dev
# Rows whose time of day falls outside the [lower_bound, upper_bound] interval
outliers = air_quality_data[(air_quality_data['Time_in_seconds'] < lower_bound) | (air_quality_data['Time_in_seconds'] > upper_bound)]
# Step 6: Display results
if not outliers.empty:
    print("Outliers detected in Time after adjusting thresholds:")
    print(outliers)
else:
    print("No outliers detected in Time.")
Unique Time Values: [datetime.time(18, 0) datetime.time(19, 0) datetime.time(20, 0) datetime.time(21, 0) datetime.time(22, 0) datetime.time(23, 0) datetime.time(0, 0) datetime.time(1, 0) datetime.time(2, 0) datetime.time(3, 0) datetime.time(4, 0) datetime.time(5, 0) datetime.time(6, 0) datetime.time(7, 0) datetime.time(8, 0) datetime.time(9, 0) datetime.time(10, 0) datetime.time(11, 0) datetime.time(12, 0) datetime.time(13, 0) datetime.time(14, 0) datetime.time(15, 0) datetime.time(16, 0) datetime.time(17, 0)] Min Time in seconds: 0 Max Time in seconds: 82800 No outliers detected in Time.
# The Function to Generate Plots
def generate_plots(data, column_name):
    """Print IQR-based outlier statistics for a column and plot its distribution.

    The quartiles, the IQR, the 1.5 * IQR fences and the smallest value that
    lies on or above the lower fence are printed; a box plot and a 20-bin
    histogram with a KDE curve are then drawn side by side on one figure.

    Args:
        data: DataFrame holding ``column_name``.
        column_name: Numeric column to summarise and plot.
    """
    values = data[column_name]

    # Quartiles and interquartile range
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # Fences beyond which a value counts as an outlier
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Smallest value that is not below the lower fence
    non_outlier_min = values[values >= lower_bound].min()

    print(f"Q1 (25th percentile): {Q1}")
    print(f"Q3 (75th percentile): {Q3}")
    print(f"IQR: {IQR}")
    print(f"Lower bound for outliers: {lower_bound}")
    print(f"Upper bound for outliers: {upper_bound}")
    print(f"Minimum value that is not an outlier: {non_outlier_min}")

    plt.figure(figsize=(8, 4))

    # Left panel: box plot
    plt.subplot(1, 2, 1)
    sns.boxplot(y=values)
    plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
    plt.ylabel(column_name)
    plt.title(f'Box Plot of {column_name}')

    # Right panel: histogram
    plt.subplot(1, 2, 2)
    sns.histplot(values, bins=20, kde=True)
    plt.gca().get_yaxis().get_major_formatter().set_scientific(False)
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.title(f'Histogram of {column_name}')

    plt.tight_layout()
    plt.show()
# Keep an independent snapshot of the frame before the per-column outlier filters.
# A plain assignment would only create a second name for the same object, so
# in-place changes (such as the date parsing in generate_bar_chart) would also
# alter the "copy".
air_quality_data_copy = air_quality_data.copy()
# Restore point: continue from a fresh copy of the snapshot
air_quality_data = air_quality_data_copy.copy()
# Regenerate the bar chart to see the updated distribution
updated_data_counts = generate_bar_chart(air_quality_data, 'Date')
print("Updated Data Counts by Month:")
print(updated_data_counts)
Updated Data Counts by Month: Date 2004-03 510 2004-04 720 2004-05 744 2004-06 720 2004-07 744 2004-08 744 2004-09 720 2004-10 744 2004-11 720 2004-12 744 2005-01 744 2005-02 672 2005-03 744 Freq: M, Name: count, dtype: int64
- Column: CO(GT)
# Statistics and plots for 'CO(GT)' on the unfiltered data
generate_plots(air_quality_data, 'CO(GT)')
# Keep only the rows whose 'CO(GT)' value is at most 3.7
air_quality_data = air_quality_data.loc[lambda frame: frame['CO(GT)'] <= 3.7]
# Report the remaining size, then repeat the statistics and plots on the filtered data
print("Dataset shape after removing outliers:", air_quality_data.shape)
generate_plots(air_quality_data, 'CO(GT)')
Q1 (25th percentile): 1.3 Q3 (75th percentile): 2.6 IQR: 1.3 Lower bound for outliers: -0.6500000000000001 Upper bound for outliers: 4.550000000000001 Minimum value that is not an outlier: 0.1
Dataset shape after removing outliers: (8242, 16) Q1 (25th percentile): 1.1 Q3 (75th percentile): 2.2 IQR: 1.1 Lower bound for outliers: -0.55 Upper bound for outliers: 3.8500000000000005 Minimum value that is not an outlier: 0.1
- Column: PT08.S1(CO)
# Statistics and plots for 'PT08.S1(CO)' on the current data
generate_plots(air_quality_data, 'PT08.S1(CO)')
# Keep only the rows whose 'PT08.S1(CO)' value lies in [689.0, 1503.0] (both ends included)
air_quality_data = air_quality_data[air_quality_data['PT08.S1(CO)'].between(689.0, 1503.0)]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'PT08.S1(CO)')
Q1 (25th percentile): 929.0 Q3 (75th percentile): 1163.0 IQR: 234.0 Lower bound for outliers: 578.0 Upper bound for outliers: 1514.0 Minimum value that is not an outlier: 647.0
Q1 (25th percentile): 927.0 Q3 (75th percentile): 1158.0 IQR: 231.0 Lower bound for outliers: 580.5 Upper bound for outliers: 1504.5 Minimum value that is not an outlier: 689.0
- Column: NMHC(GT)
# Statistics and plots for 'NMHC(GT)' on the current data
generate_plots(air_quality_data, 'NMHC(GT)')
# Keep only the rows whose 'NMHC(GT)' value is at most 215.
# NOTE(review): missing NMHC(GT) readings (8443 of 9357 rows) were imputed with the
# column mean of about 218.81, which is above this cut-off, so this filter removes
# every imputed row. Confirm that discarding those rows is intended.
air_quality_data = air_quality_data.loc[lambda frame: frame['NMHC(GT)'] <= 215]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'NMHC(GT)')
Q1 (25th percentile): 218.81181619256017 Q3 (75th percentile): 218.81181619256017 IQR: 0.0 Lower bound for outliers: 218.81181619256017 Upper bound for outliers: 218.81181619256017 Minimum value that is not an outlier: 218.81181619256017
Q1 (25th percentile): 50.0 Q3 (75th percentile): 140.25 IQR: 90.25 Lower bound for outliers: -85.375 Upper bound for outliers: 275.625 Minimum value that is not an outlier: 7.0
- Column: C6H6(GT)
# Print the IQR statistics and draw the box plot and histogram for 'C6H6(GT)' (no rows are filtered)
generate_plots(air_quality_data, 'C6H6(GT)')
Q1 (25th percentile): 3.0 Q3 (75th percentile): 8.6 IQR: 5.6 Lower bound for outliers: -5.399999999999999 Upper bound for outliers: 17.0 Minimum value that is not an outlier: 0.5
- Column: PT08.S2(NMHC)
# Print the IQR statistics and draw the box plot and histogram for 'PT08.S2(NMHC)' (no rows are filtered)
generate_plots(air_quality_data, 'PT08.S2(NMHC)')
Q1 (25th percentile): 653.25 Q3 (75th percentile): 924.0 IQR: 270.75 Lower bound for outliers: 247.125 Upper bound for outliers: 1330.125 Minimum value that is not an outlier: 448.0
- Column: NOx(GT)
# Statistics and plots for 'NOx(GT)' on the current data
generate_plots(air_quality_data, 'NOx(GT)')
# Keep only the rows whose 'NOx(GT)' value is at most 215.0
air_quality_data = air_quality_data.loc[lambda frame: frame['NOx(GT)'] <= 215.0]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'NOx(GT)')
Q1 (25th percentile): 62.0 Q3 (75th percentile): 133.0 IQR: 71.0 Lower bound for outliers: -44.5 Upper bound for outliers: 239.5 Minimum value that is not an outlier: 10.0
Q1 (25th percentile): 60.0 Q3 (75th percentile): 122.0 IQR: 62.0 Lower bound for outliers: -33.0 Upper bound for outliers: 215.0 Minimum value that is not an outlier: 10.0
- Column: PT08.S3(NOx)
# Statistics and plots for 'PT08.S3(NOx)' on the current data
generate_plots(air_quality_data, 'PT08.S3(NOx)')
# Keep only the rows whose 'PT08.S3(NOx)' value lies in [689.0, 1643] (both ends included)
air_quality_data = air_quality_data[air_quality_data['PT08.S3(NOx)'].between(689.0, 1643)]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'PT08.S3(NOx)')
Q1 (25th percentile): 939.0 Q3 (75th percentile): 1244.0 IQR: 305.0 Lower bound for outliers: 481.5 Upper bound for outliers: 1701.5 Minimum value that is not an outlier: 646.0
Q1 (25th percentile): 937.0 Q3 (75th percentile): 1221.5 IQR: 284.5 Lower bound for outliers: 510.25 Upper bound for outliers: 1648.25 Minimum value that is not an outlier: 689.0
- Column: NO2(GT)
# Statistics and plots for 'NO2(GT)' on the current data
generate_plots(air_quality_data, 'NO2(GT)')
# Keep only the rows whose 'NO2(GT)' value lies in [26, 138.0] (both ends included)
air_quality_data = air_quality_data[air_quality_data['NO2(GT)'].between(26, 138.0)]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'NO2(GT)')
Q1 (25th percentile): 69.0 Q3 (75th percentile): 98.0 IQR: 29.0 Lower bound for outliers: 25.5 Upper bound for outliers: 141.5 Minimum value that is not an outlier: 28.0
Q1 (25th percentile): 69.0 Q3 (75th percentile): 98.0 IQR: 29.0 Lower bound for outliers: 25.5 Upper bound for outliers: 141.5 Minimum value that is not an outlier: 28.0
- Column: PT08.S4(NO2)
# Statistics and plots for 'PT08.S4(NO2)' on the current data
generate_plots(air_quality_data, 'PT08.S4(NO2)')
# Keep only the rows whose 'PT08.S4(NO2)' value lies in [1036, 1856] (both ends included)
air_quality_data = air_quality_data[air_quality_data['PT08.S4(NO2)'].between(1036, 1856)]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'PT08.S4(NO2)')
Q1 (25th percentile): 1307.5 Q3 (75th percentile): 1534.5 IQR: 227.0 Lower bound for outliers: 967.0 Upper bound for outliers: 1875.0 Minimum value that is not an outlier: 1036.0
Q1 (25th percentile): 1307.0 Q3 (75th percentile): 1532.25 IQR: 225.25 Lower bound for outliers: 969.125 Upper bound for outliers: 1870.125 Minimum value that is not an outlier: 1036.0
- Column: PT08.S5(O3)
# Statistics and plots for 'PT08.S5(O3)' on the current data
generate_plots(air_quality_data, 'PT08.S5(O3)')
# Keep only the rows whose 'PT08.S5(O3)' value lies in [684, 1423] (both ends included)
air_quality_data = air_quality_data[air_quality_data['PT08.S5(O3)'].between(684, 1423)]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'PT08.S5(O3)')
Q1 (25th percentile): 644.0 Q3 (75th percentile): 1016.75 IQR: 372.75 Lower bound for outliers: 84.875 Upper bound for outliers: 1575.875 Minimum value that is not an outlier: 332.0
Q1 (25th percentile): 806.0 Q3 (75th percentile): 1054.0 IQR: 248.0 Lower bound for outliers: 434.0 Upper bound for outliers: 1426.0 Minimum value that is not an outlier: 684.0
- Column: T
# Statistics and plots for 'T' on the current data
generate_plots(air_quality_data, 'T')
# Keep only the rows whose 'T' value is at most 24.8
air_quality_data = air_quality_data.loc[lambda frame: frame['T'] <= 24.8]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'T')
Q1 (25th percentile): 11.0 Q3 (75th percentile): 16.9 IQR: 5.899999999999999 Lower bound for outliers: 2.150000000000002 Upper bound for outliers: 25.749999999999996 Minimum value that is not an outlier: 6.3
Q1 (25th percentile): 11.0 Q3 (75th percentile): 16.6 IQR: 5.600000000000001 Lower bound for outliers: 2.599999999999998 Upper bound for outliers: 25.000000000000004 Minimum value that is not an outlier: 6.3
- Column: RH
# Print the IQR statistics and draw the box plot and histogram for 'RH' (no rows are filtered)
generate_plots(air_quality_data, 'RH')
Q1 (25th percentile): 43.5 Q3 (75th percentile): 64.0 IQR: 20.5 Lower bound for outliers: 12.75 Upper bound for outliers: 94.75 Minimum value that is not an outlier: 22.9
- Column: AH
# Statistics and plots for 'AH' on the current data
generate_plots(air_quality_data, 'AH')
# Keep only the rows whose 'AH' value is at most 1.2016
air_quality_data = air_quality_data.loc[lambda frame: frame['AH'] <= 1.2016]
# Repeat the statistics and plots on the filtered data
generate_plots(air_quality_data, 'AH')
Q1 (25th percentile): 0.7277 Q3 (75th percentile): 0.9331 IQR: 0.20540000000000003 Lower bound for outliers: 0.4196 Upper bound for outliers: 1.2412 Minimum value that is not an outlier: 0.5037
Q1 (25th percentile): 0.726425 Q3 (75th percentile): 0.922675 IQR: 0.19625000000000004 Lower bound for outliers: 0.43204999999999993 Upper bound for outliers: 1.21705 Minimum value that is not an outlier: 0.5037
-------------------------------Perform Correlation Analysis------------------------------------
# Summary statistics (count, mean, min, quartiles, max, std) of the filtered frame
summary_table = air_quality_data.describe()
print("\nDescriptive Statistics:")
print(summary_table)
Descriptive Statistics:
Date CO(GT) PT08.S1(CO) NMHC(GT) \
count 332 332.000000 332.000000 332.000000
mean 2004-03-31 02:57:49.879518080 1.671359 1104.033126 108.141566
min 2004-03-10 00:00:00 0.500000 823.000000 17.000000
25% 2004-03-17 00:00:00 1.200000 1020.000000 65.750000
50% 2004-03-30 00:00:00 1.700000 1098.000000 100.000000
75% 2004-04-10 00:00:00 2.100000 1173.750000 151.000000
max 2004-04-29 00:00:00 3.100000 1478.000000 214.000000
std NaN 0.562613 129.263905 52.195263
C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) \
count 332.000000 332.000000 332.000000 332.000000 332.000000
mean 6.910233 839.345905 105.283133 1037.547962 88.668675
min 1.300000 529.000000 31.000000 770.000000 44.000000
25% 4.600000 743.750000 75.750000 924.000000 74.750000
50% 7.050000 859.000000 106.000000 1006.000000 89.000000
75% 8.925000 937.250000 130.000000 1140.250000 102.000000
max 15.400000 1162.000000 207.000000 1490.000000 131.000000
std 2.894088 129.373294 37.685589 157.580117 19.010536
PT08.S4(NO2) PT08.S5(O3) T RH AH \
count 332.000000 332.000000 332.000000 332.000000 332.000000
mean 1466.639082 939.297812 13.869319 53.591598 0.823194
min 1101.000000 684.000000 6.300000 22.900000 0.503700
25% 1376.750000 798.500000 11.000000 43.075000 0.726425
50% 1456.264598 918.500000 13.550000 55.900000 0.821650
75% 1551.250000 1048.250000 16.625000 63.725000 0.922675
max 1831.000000 1423.000000 24.800000 81.800000 1.201600
std 120.251935 168.215096 3.926728 13.727528 0.138419
Time_in_seconds
count 332.000000
mean 36390.361446
min 0.000000
25% 14400.000000
50% 32400.000000
75% 57600.000000
max 82800.000000
std 26979.215966
# Draw seaborn's pairwise plot grid to inspect the relationships between the features visually
sns.pairplot(air_quality_data)
<seaborn.axisgrid.PairGrid at 0x32e895b80>
Let's check the correlation between all features with a multivariate correlation matrix
# Restrict the frame to its numeric columns
numeric_data = air_quality_data.select_dtypes(include=[np.number])
# Pairwise correlation between every pair of numeric columns
correlation_matrix = numeric_data.corr()
# Print the heading and the matrix on separate lines
print("Correlation Matrix:", correlation_matrix, sep="\n")
# Render the matrix as an annotated heatmap on a 12x10 figure
plt.figure(figsize=(12, 10))
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar_kws={"shrink": .8})
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Air Quality Features')
plt.show()
Correlation Matrix:
CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) \
CO(GT) 1.000000 0.702448 0.651644 0.764485 0.753672
PT08.S1(CO) 0.702448 1.000000 0.494487 0.782431 0.792730
NMHC(GT) 0.651644 0.494487 1.000000 0.821488 0.827793
C6H6(GT) 0.764485 0.782431 0.821488 1.000000 0.992076
PT08.S2(NMHC) 0.753672 0.792730 0.827793 0.992076 1.000000
NOx(GT) 0.746675 0.762217 0.668361 0.823385 0.833207
PT08.S3(NOx) -0.519027 -0.530716 -0.813310 -0.796526 -0.818349
NO2(GT) 0.719529 0.763548 0.593645 0.773642 0.788994
PT08.S4(NO2) 0.661246 0.815774 0.664803 0.846618 0.854520
PT08.S5(O3) 0.478330 0.741535 0.350762 0.612279 0.610970
T 0.373794 0.358496 0.576868 0.599520 0.578961
RH -0.335964 -0.233559 -0.483865 -0.497006 -0.483847
AH -0.051088 0.117911 0.034028 0.042551 0.033217
Time_in_seconds 0.392075 0.327017 0.455701 0.445653 0.454843
NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) \
CO(GT) 0.746675 -0.519027 0.719529 0.661246 0.478330
PT08.S1(CO) 0.762217 -0.530716 0.763548 0.815774 0.741535
NMHC(GT) 0.668361 -0.813310 0.593645 0.664803 0.350762
C6H6(GT) 0.823385 -0.796526 0.773642 0.846618 0.612279
PT08.S2(NMHC) 0.833207 -0.818349 0.788994 0.854520 0.610970
NOx(GT) 1.000000 -0.623748 0.803636 0.736871 0.539520
PT08.S3(NOx) -0.623748 1.000000 -0.540731 -0.758095 -0.596397
NO2(GT) 0.803636 -0.540731 1.000000 0.619613 0.500680
PT08.S4(NO2) 0.736871 -0.758095 0.619613 1.000000 0.726916
PT08.S5(O3) 0.539520 -0.596397 0.500680 0.726916 1.000000
T 0.303004 -0.562159 0.386094 0.462819 0.255911
RH -0.246332 0.281245 -0.395308 -0.155195 0.002802
AH 0.000487 -0.379692 -0.088301 0.393056 0.412575
Time_in_seconds 0.385856 -0.319930 0.524328 0.305156 0.124902
T RH AH Time_in_seconds
CO(GT) 0.373794 -0.335964 -0.051088 0.392075
PT08.S1(CO) 0.358496 -0.233559 0.117911 0.327017
NMHC(GT) 0.576868 -0.483865 0.034028 0.455701
C6H6(GT) 0.599520 -0.497006 0.042551 0.445653
PT08.S2(NMHC) 0.578961 -0.483847 0.033217 0.454843
NOx(GT) 0.303004 -0.246332 0.000487 0.385856
PT08.S3(NOx) -0.562159 0.281245 -0.379692 -0.319930
NO2(GT) 0.386094 -0.395308 -0.088301 0.524328
PT08.S4(NO2) 0.462819 -0.155195 0.393056 0.305156
PT08.S5(O3) 0.255911 0.002802 0.412575 0.124902
T 1.000000 -0.799891 0.139286 0.327606
RH -0.799891 1.000000 0.463288 -0.381811
AH 0.139286 0.463288 1.000000 -0.138448
Time_in_seconds 0.327606 -0.381811 -0.138448 1.000000
----------------Linear Regression-----------------------------------
# Step 1: Data Preparation
# Choose the predictor columns (X) and the column to predict (y).
# Here RH is modelled from the other sensor responses and environmental columns.
features = [
    'CO(GT)',
    'PT08.S1(CO)',
    'NMHC(GT)',
    'C6H6(GT)',
    'PT08.S2(NMHC)',
    'NOx(GT)',
    'PT08.S3(NOx)',
    'T',
    'AH',
]
target = 'RH'  # Relative Humidity (RH) is the quantity being predicted
X = air_quality_data[features]  # feature DataFrame, one column per entry in `features`
y = air_quality_data[target]  # target Series (RH)
# X and y are now ready to be split into training and testing sets.
# Step 2: Data Splitting
# Hold out 20% of the rows for testing; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# X_train / y_train are used to fit the model,
# X_test / y_test are reserved for measuring its performance.
# Step 3: Model Training
# Instantiate a scikit-learn LinearRegression with its default settings
modelr = LinearRegression()
# Fit on the training split only; the test split stays unseen until evaluation
modelr.fit(X_train, y_train)
# modelr is now fitted and can predict RH from the selected features.
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
# Step 4: Model Prediction
# Run the fitted model on the held-out test features
y_pred = modelr.predict(X_test)
# y_pred holds one predicted RH value per row of X_test.
# Step 5: Model Evaluation
# Compare the held-out RH values with the model's predictions:
#   - MSE: mean of the squared prediction errors (lower is better)
#   - R^2: coefficient of determination (1.0 is a perfect fit)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# Report both metrics
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
Mean Squared Error: 3.689716667782008 R^2 Score: 0.9784059406354653
# Optional: Visualize Predictions
# Scatter of actual vs. predicted RH for the test split; points lying on the
# dashed red diagonal are predictions that match the actual value exactly.
plt.figure(figsize=(10, 5))
plt.scatter(x=y_test, y=y_pred, alpha=0.5)
# Identity line (predicted == actual), spanning the min..max of the full RH column
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.title('Actual vs Predicted RH')
plt.xlabel('Actual RH')
plt.ylabel('Predicted RH')
plt.show()
# Step 6: Examine the model coefficients
# One row per feature (labelled with the feature name), one 'Coefficient' column
coefficients = pd.DataFrame(
    data=modelr.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
print(coefficients)
Coefficient CO(GT) 0.759028 PT08.S1(CO) -0.002882 NMHC(GT) 0.005214 C6H6(GT) -0.901792 PT08.S2(NMHC) 0.023927 NOx(GT) 0.015865 PT08.S3(NOx) 0.009813 T -3.040923 AH 62.548884
-------------------------------Forecasting------------------------------------ Forecasting with the FB Prophet algorithm. FB Prophet documentation: https://facebook.github.io/prophet/docs/quick_start.html
# Parse the 'Date' column into pandas datetimes (displayed as YYYY-MM-DD)
# NOTE(review): no format/dayfirst is passed; if 'Date' still holds DD/MM/YYYY strings
# at this point the parse is ambiguous — confirm it was normalised earlier in the notebook
date_info = pd.to_datetime(air_quality_data['Date'])
print(date_info)
0 2004-03-10
1 2004-03-10
2 2004-03-10
3 2004-03-10
4 2004-03-10
...
1183 2004-04-29
1184 2004-04-29
1186 2004-04-29
1187 2004-04-29
1188 2004-04-29
Name: Date, Length: 332, dtype: datetime64[ns]
# Take the 'Time' column of air_quality_data as its own Series and print it for inspection
time_info = air_quality_data['Time']
print(time_info)
0 18:00:00
1 19:00:00
2 20:00:00
3 21:00:00
4 22:00:00
...
1183 01:00:00
1184 02:00:00
1186 04:00:00
1187 05:00:00
1188 06:00:00
Name: Time, Length: 332, dtype: object
# Normalise the separator inside each time string: every '.' becomes ':'.
# Casting to str first guarantees each element is a string before the literal
# (non-regex) replacement is applied.
time_info = time_info.astype(str).str.replace('.', ':', regex=False)
print(time_info)
0 18:00:00
1 19:00:00
2 20:00:00
3 21:00:00
4 22:00:00
...
1183 01:00:00
1184 02:00:00
1186 04:00:00
1187 05:00:00
1188 06:00:00
Name: Time, Length: 332, dtype: object
# Print the types of the 'date_info' and 'time_info' variables to confirm both are pandas Series
print(type(date_info))
print(type(time_info))
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
# Place the two Series side by side as the columns of one DataFrame
date_time = pd.concat([date_info, time_info], axis="columns")
# Preview the first five rows
date_time.head()
| Date | Time | |
|---|---|---|
| 0 | 2004-03-10 | 18:00:00 |
| 1 | 2004-03-10 | 19:00:00 |
| 2 | 2004-03-10 | 20:00:00 |
| 3 | 2004-03-10 | 21:00:00 |
| 4 | 2004-03-10 | 22:00:00 |
# (rows, columns) of the combined date/time frame
date_time.shape
(332, 2)
# Build the 'ds' column as the text "<Date> <Time>", joining the two columns with a single space
date_time['ds'] = date_time['Date'].astype(str).str.cat(date_time['Time'].astype(str), sep=' ')
# Preview the first five rows
date_time.head()
| Date | Time | ds | |
|---|---|---|---|
| 0 | 2004-03-10 | 18:00:00 | 2004-03-10 18:00:00 |
| 1 | 2004-03-10 | 19:00:00 | 2004-03-10 19:00:00 |
| 2 | 2004-03-10 | 20:00:00 | 2004-03-10 20:00:00 |
| 3 | 2004-03-10 | 21:00:00 | 2004-03-10 21:00:00 |
| 4 | 2004-03-10 | 22:00:00 | 2004-03-10 22:00:00 |
# Summarise column dtypes and non-null counts; 'ds' is still plain text (object) at this point
date_time.info()
<class 'pandas.core.frame.DataFrame'> Index: 332 entries, 0 to 1188 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 332 non-null datetime64[ns] 1 Time 332 non-null object 2 ds 332 non-null object dtypes: datetime64[ns](1), object(2) memory usage: 10.4+ KB
# Start the 'data' DataFrame with a single 'ds' column:
# the combined date/time strings from date_time, parsed into datetimes
data = pd.DataFrame({'ds': pd.to_datetime(date_time['ds'])})
# Preview the first five rows
data.head()
| ds | |
|---|---|
| 0 | 2004-03-10 18:00:00 |
| 1 | 2004-03-10 19:00:00 |
| 2 | 2004-03-10 20:00:00 |
| 3 | 2004-03-10 21:00:00 |
| 4 | 2004-03-10 22:00:00 |
# Copy the 'RH' column of air_quality_data into a new column 'y' of the data DataFrame.
# Prophet expects the value to forecast in a column named 'y'; here that is relative humidity (RH).
# pandas aligns the assignment on the row index, so each 'y' lines up with the 'ds' of the same row.
data['y'] = air_quality_data['RH']
data.head()
| ds | y | |
|---|---|---|
| 0 | 2004-03-10 18:00:00 | 48.9 |
| 1 | 2004-03-10 19:00:00 | 47.7 |
| 2 | 2004-03-10 20:00:00 | 54.0 |
| 3 | 2004-03-10 21:00:00 | 60.0 |
| 4 | 2004-03-10 22:00:00 | 59.6 |
!pip install prophet
Requirement already satisfied: prophet in /opt/anaconda3/lib/python3.12/site-packages (1.1.6) Requirement already satisfied: cmdstanpy>=1.0.4 in /opt/anaconda3/lib/python3.12/site-packages (from prophet) (1.2.4) Requirement already satisfied: numpy>=1.15.4 in /opt/anaconda3/lib/python3.12/site-packages (from prophet) (1.26.4) Requirement already satisfied: matplotlib>=2.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from prophet) (3.8.4) Requirement already satisfied: pandas>=1.0.4 in /opt/anaconda3/lib/python3.12/site-packages (from prophet) (2.2.3) Requirement already satisfied: holidays<1,>=0.25 in /opt/anaconda3/lib/python3.12/site-packages (from prophet) (0.59) Requirement already satisfied: tqdm>=4.36.1 in /opt/anaconda3/lib/python3.12/site-packages (from prophet) (4.66.4) Requirement already satisfied: importlib-resources in /opt/anaconda3/lib/python3.12/site-packages (from prophet) (6.4.5) Requirement already satisfied: stanio<2.0.0,>=0.4.0 in /opt/anaconda3/lib/python3.12/site-packages (from cmdstanpy>=1.0.4->prophet) (0.5.1) Requirement already satisfied: python-dateutil in /opt/anaconda3/lib/python3.12/site-packages (from holidays<1,>=0.25->prophet) (2.9.0.post0) Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib>=2.0.0->prophet) (1.2.0) Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib>=2.0.0->prophet) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib>=2.0.0->prophet) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib>=2.0.0->prophet) (1.4.4) Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib>=2.0.0->prophet) (23.2) Requirement already satisfied: pillow>=8 in /opt/anaconda3/lib/python3.12/site-packages (from 
matplotlib>=2.0.0->prophet) (10.3.0) Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib>=2.0.0->prophet) (3.0.9) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas>=1.0.4->prophet) (2024.1) Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas>=1.0.4->prophet) (2023.3) Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil->holidays<1,>=0.25->prophet) (1.16.0)
# Prophet is imported here, right before its first use
from prophet import Prophet
# Create a Prophet model with its default settings and fit it on `data` (columns 'ds' and 'y')
model = Prophet()
model.fit(data)
03:02:14 - cmdstanpy - INFO - Chain [1] start processing 03:02:14 - cmdstanpy - INFO - Chain [1] done processing
<prophet.forecaster.Prophet at 0x34b2c7bf0>
# Build the frame of timestamps the model will predict on.
# 'make_future_dataframe' extends the training timestamps with future ones.
# periods=900 requests 900 additional steps beyond the last training timestamp.
# freq='h' makes each step one hour, so the forecast horizon is 900 hours (37.5 days).
future = model.make_future_dataframe(periods=900, freq='h')
# Display the last five rows of the future DataFrame to verify the generated timestamps.
future.tail()
| ds | |
|---|---|
| 1227 | 2004-06-05 14:00:00 |
| 1228 | 2004-06-05 15:00:00 |
| 1229 | 2004-06-05 16:00:00 |
| 1230 | 2004-06-05 17:00:00 |
| 1231 | 2004-06-05 18:00:00 |
# Run the fitted model over every timestamp in the future DataFrame
forecast = model.predict(future)
# Last five forecast rows, restricted to the timestamp, the point forecast and its interval bounds
forecast.tail()[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
| ds | yhat | yhat_lower | yhat_upper | |
|---|---|---|---|---|
| 1227 | 2004-06-05 14:00:00 | 20.705931 | 8.922769 | 32.974596 |
| 1228 | 2004-06-05 15:00:00 | 21.781910 | 9.182241 | 33.123846 |
| 1229 | 2004-06-05 16:00:00 | 21.048427 | 8.653534 | 33.490366 |
| 1230 | 2004-06-05 17:00:00 | 19.691495 | 7.477750 | 31.829399 |
| 1231 | 2004-06-05 18:00:00 | 19.686417 | 7.123190 | 31.410353 |
# Plot the forecast produced by the fitted model.
# 'plot' draws the predicted values (yhat) together with the historical observations,
# plus the uncertainty interval between yhat_lower and yhat_upper.
fig1 = model.plot(forecast)
# Plot the individual components of the forecast.
# 'plot_components' shows the trend and the seasonal effects separately,
# which makes it easier to see how each one contributes to the overall forecast.
fig2 = model.plot_components(forecast)